evalutions of pseudobulk rna-seq samples
# data quality
mets_only_tcga = True
gene_sparsity_ceiling_tcga = 0.5
# pseudobulk generation parameters
n_cells_per_cell_type = 10
malignant_from_one_sample = True
import pathlib
figure_path = pathlib.Path("figures-8e")
figure_path.mkdir(parents=True, exist_ok=True)
figure_path
PosixPath('figures-8e')
import logging
import helpers
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go
import plotly.io
# plotly.io.renderers.default = "plotly_mimetype+notebook_connected"
handler = logging.StreamHandler()
formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
handler.setFormatter(formatter)
logging.getLogger().addHandler(handler)
logging.getLogger("gcsfs").setLevel("INFO")
logging.getLogger("google.cloud.bigquery").setLevel("DEBUG")
logging.getLogger("helpers").setLevel("DEBUG")
logging.getLogger("helpers.creating_mixtures").setLevel("INFO")
logging.getLogger("pandas").setLevel("DEBUG")
logging.getLogger("pyarrow").setLevel("DEBUG")
logger = logging.getLogger(__name__)
logger.setLevel("DEBUG")
rng = np.random.default_rng(seed=0)
%%time
bulk_tcga_skcm = helpers.datasets.load_tcga_skcm_hg19_scaled_estimate_firebrowse()
bulk_tcga_skcm *= 1_000_000 / bulk_tcga_skcm.sum()
2022-05-20 20:34:35,057 - helpers.datasets - DEBUG - reading gs://liulab/firebrowse.org/SKCM.rnaseqv2__illuminahiseq_rnaseqv2__unc_edu__Level_3__RSEM_genes__data.data.txt
CPU times: user 13.4 s, sys: 1.4 s, total: 14.8 s Wall time: 22.9 s
%%time
fractions_tcga_skcm = helpers.datasets.load_tcga_skcm_fractions_from_csx()
2022-05-20 20:34:57,984 - helpers.datasets - DEBUG - loading TCGA SKCM fractions estimated by CIBERSORTx
CPU times: user 12 ms, sys: 3.96 ms, total: 15.9 ms Wall time: 90.7 ms
%%time
sc_jerby_arnon, sc_metadata_jerby_arnon = helpers.datasets.load_jerby_arnon(
ref_genome="hg19", units="tpm"
)
sc_jerby_arnon *= 1_000_000 / sc_jerby_arnon.sum()
2022-05-20 20:34:58,082 - helpers.datasets - DEBUG - loading Jerby-Arnon scRNA-seq data 2022-05-20 20:35:31,823 - helpers.datasets - DEBUG - loading Jerby-Arnon metadata
CPU times: user 1min 10s, sys: 16.9 s, total: 1min 27s Wall time: 34.7 s
# determine gene exclusions
exclusions_genes = pd.DataFrame(index=bulk_tcga_skcm.index.union(sc_jerby_arnon.index))
## sparsity in tcga
genes_sparse_tcga_skcm = bulk_tcga_skcm[
(bulk_tcga_skcm == 0).mean(axis=1) > gene_sparsity_ceiling_tcga
].index
exclusions_genes["sparse_in_tcga_skcm"] = exclusions_genes.index.map(
lambda g: g in genes_sparse_tcga_skcm
)
## genes not in both cohorts
genes_in_both = bulk_tcga_skcm.index.intersection(sc_jerby_arnon.index)
exclusions_genes["not_in_both_cohorts"] = exclusions_genes.index.map(
lambda gene_name: gene_name not in genes_in_both
)
good_genes = exclusions_genes.loc[~exclusions_genes.any(axis=1)].index
# determine tcga sample exclusions
exclusions_tcga_samples = pd.DataFrame(index=bulk_tcga_skcm.columns)
## limit to metastases
if mets_only_tcga:
query_text = """
SELECT aliquot_barcode
FROM `keen-dispatch-316219.gdc_tcga_skcm_subset.aliquot2caseIDmap_current`
WHERE sample_type_name = 'Metastatic'
"""
metastatic_aliquot_barcodes = pd.read_gbq(query_text)["aliquot_barcode"].values
exclusions_tcga_samples["is_not_metastatic"] = exclusions_tcga_samples.index.map(
lambda sample: sample not in metastatic_aliquot_barcodes
)
good_tcga_samples = exclusions_tcga_samples.loc[
~exclusions_tcga_samples.any(axis=1)
].index
# apply exclusions
sc_jerby_arnon_cleaned = sc_jerby_arnon.loc[good_genes]
sc_jerby_arnon_cleaned *= 1_000_000 / sc_jerby_arnon_cleaned.sum()
bulk_tcga_skcm_cleaned = bulk_tcga_skcm.loc[good_genes][good_tcga_samples]
bulk_tcga_skcm_cleaned *= 1_000_000 / bulk_tcga_skcm_cleaned.sum()
fractions_tcga_skcm_cleaned = fractions_tcga_skcm.loc[good_tcga_samples]
print(
"without exclusions:",
[df.shape for df in (sc_jerby_arnon, bulk_tcga_skcm, fractions_tcga_skcm)],
)
print(
"with exclusions:",
[
df.shape
for df in (
sc_jerby_arnon_cleaned,
bulk_tcga_skcm_cleaned,
fractions_tcga_skcm_cleaned,
)
],
)
without exclusions: [(23686, 7186), (20501, 473), (473, 9)] with exclusions: [(16063, 7186), (16063, 368), (368, 9)]
bulk_pseudo, bulk_pseudo_cell_type_geps = helpers.creating_mixtures.make_mixtures(
sc_jerby_arnon_cleaned,
sc_metadata_jerby_arnon,
fractions_tcga_skcm_cleaned.rename(index=lambda sample: f"pseudo_like_{sample}"),
n_cells_per_gep=n_cells_per_cell_type,
normalization_factor=1_000_000,
malignant_from_one_sample=malignant_from_one_sample,
rng=rng,
)
def calc_gene_sparsity(df):
return (df == 0).mean(axis=1)
def calc_gene_density(df):
return (df != 0).mean(axis=1)
x = calc_gene_sparsity(bulk_tcga_skcm)
fig = go.Figure(
data=[
go.Histogram(
x=x,
xbins=dict(start=0.0, end=1.0, size=0.1),
)
]
)
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[0, 1])
fig.update_yaxes(range=[0, 20_000])
fig.update_layout(
title="Gene sparsity of TCGA SKCM bulk RNA-seq (hg19 TPM) <br>(original data)",
xaxis_title="sparsity (binned)",
yaxis_title="count of genes",
)
fig.write_image(
figure_path / "bulk_tcga_skcm_sparsity.png", scale=2.0, width=500, height=500
)
fig
x = calc_gene_sparsity(bulk_tcga_skcm_cleaned)
fig = go.Figure(
data=[
go.Histogram(
x=x,
xbins=dict(start=0.0, end=1.0, size=0.1),
)
]
)
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[0, 1])
fig.update_yaxes(range=[0, 20_000])
fig.update_layout(
title="Gene sparsity of TCGA SKCM bulk RNA-seq (hg19 TPM) <br>(metastatic only, excl >50% sparse genes)",
xaxis_title="sparsity (binned)",
yaxis_title="count of genes",
)
fig.write_image(
figure_path / "bulk_tcga_skcm_cleaned_sparsity.png",
scale=2.0,
width=500,
height=500,
)
fig
fig = go.Figure(
data=[
go.Histogram(
x=x,
xbins=dict(start=-2, end=5, size=0.5),
)
]
)
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[-2, 5])
# fig.update_yaxes(range=[0, 10_000])
fig.update_layout(
title="Distribution of mean gene expression: TCGA SKCM bulk RNA-seq <br>(hg19 TPM, metastatic only, excl >50% sparse genes)",
xaxis_title="Gene expression (binned, log10-transformed)",
yaxis_title="count of genes",
)
fig.write_image(
figure_path / "bulk_tcga_skcm_cleaned_dist_mean.png",
scale=1.5,
width=800,
height=800,
)
fig
import plotly.figure_factory as ff
x = np.log(bulk_tcga_skcm_cleaned.mean(axis="columns")) / np.log(10)
hist_data = [x]
group_labels = ["TCGA SKCM bulk RNA-seq"] # name of the dataset
fig = ff.create_distplot(hist_data, group_labels, bin_size=0.2)
fig.update_layout(bargap=0.1, title="Distribution of mean gene expression")
fig.update_xaxes(showgrid=True, title="Mean gene expression (log10 scale)")
fig.update_yaxes(showgrid=True)
fig.update_layout(legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01))
fig.write_image(
figure_path / "bulk_tcga_skcm_cleaned_dist_mean__create_distplot.png",
scale=1.5,
width=800,
height=800,
)
fig
x = calc_gene_sparsity(sc_jerby_arnon)
fig = go.Figure(
data=[
go.Histogram(
x=x,
xbins=dict(start=0.0, end=1.0, size=0.1),
)
]
)
fig.update_layout(bargap=0.1)
fig.update_xaxes(range=[0, 1])
fig.update_yaxes(range=[0, 20_000])
fig.update_layout(
title="Gene sparsity of Jerby-Arnon scRNA-seq (hg19 TPM)",
xaxis_title="sparsity (binned)",
yaxis_title="count of genes",
)
fig.write_image(
figure_path / "sc_jerby_arnon_sparsity.png", scale=2.0, width=500, height=500
)
fig
import altair as alt
alt.data_transformers.disable_max_rows()
DataTransformerRegistry.enable('default')
x = calc_gene_sparsity(bulk_tcga_skcm).to_frame(name="sparsity").reset_index()
alt.Chart(x).mark_bar().encode(
alt.X(
"sparsity:Q",
bin=alt.Bin(extent=[0, 1]),
),
y="count()",
)
x = (
calc_gene_sparsity(bulk_tcga_skcm.loc[good_genes][good_tcga_samples])
.to_frame(name="sparsity")
.reset_index()
)
alt.Chart(x).mark_bar().encode(
alt.X(
"sparsity:Q",
bin=alt.Bin(extent=[0, 1]),
),
y="count()",
)
x = calc_gene_sparsity(sc_jerby_arnon).to_frame(name="sparsity").reset_index()
chart = (
alt.Chart(x)
.mark_bar()
.encode(
alt.X(
"sparsity:Q",
bin=alt.Bin(extent=[0, 1]),
),
y="count()",
)
)
chart.save(figure_path / "sc_jerby_arnon_sparsity.png", scale_factor=5.0)
chart